# author: Han Zhang
library(ggplot2)


# Plot keyword coverage against dictionary size, one line per year, and save
# the figure as a PDF.
#
# Input: a space-separated, header-less text file whose four columns are
# named below as num, keyword, year and prop. Per the axis labels, `num` is
# the keyword-dictionary size K and `prop` is the proportion of posts that
# contain words in K.
coverage <- read.csv(
  "keywordCoverageSet_keyword_year.txt",
  stringsAsFactors = FALSE,
  header = FALSE,
  sep = " "
)
names(coverage) <- c("num", "keyword", "year", "prop")

# Treat year as categorical so each year gets its own line and grey shade.
coverage$year <- as.factor(coverage$year)

coverage_plot <- ggplot(coverage, aes(x = num, y = prop, color = year)) +
  # Group by year only; x must stay inherited from the top-level mapping
  # (a positional aes(year) would remap x to the year factor).
  geom_line(aes(group = year)) +
  geom_point(size = 0.2) +
  # Reference line at dictionary size 50.
  geom_vline(xintercept = 50, linetype = "dotdash") +
  scale_y_continuous(breaks = seq(0, 1, 0.05), limits = c(0, 1)) +
  # `num` is numeric, so the axis needs a continuous scale; on a discrete
  # scale `limits` would be read as the set of allowed values, not a range.
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  scale_color_grey() +
  xlab("Size of Keyword Dictionary K") +
  ylab("Proportion of Wickedonna posts that contains words in K") +
  # theme_bw() is a complete theme and replaces any earlier theme settings,
  # so it has to come before the element-level tweaks below.
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 16, colour = "black"),
    axis.title.x = element_text(size = 15, colour = "black"),
    axis.text.y = element_text(size = 16, colour = "black"),
    axis.title.y = element_text(size = 14, colour = "black")
  )

# Render to the active graphics device, as the original top-level expression
# did through auto-printing.
print(coverage_plot)

# Pass the plot explicitly rather than relying on last_plot().
ggsave(
  "keyword_coverage_wickedonna_by_year.pdf",
  plot = coverage_plot,
  device = "pdf",
  width = 6,
  height = 5,
  units = "in"
)

